Set-up packages // libraries and prepare for data import:
## Uncomment install.packages and run outside of notebook environment:
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic", "modelr", "broom"))
library(psych)
library(likert)
Loading required package: ggplot2
Attaching package: ‘ggplot2’
The following objects are masked from ‘package:psych’:
%+%, alpha
Loading required package: xtable
library(jsonlite)
library(ggplot2)
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(modelr)
Attaching package: ‘modelr’
The following object is masked from ‘package:psych’:
heights
library(broom)
Attaching package: ‘broom’
The following object is masked from ‘package:modelr’:
bootstrap
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
Set-up directories, import and clean data:
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R")
getwd()
[1] "/Users/allieblaising/Desktop/bang/R"
dataPath = "../.data"
## Define function to extract survey results:
extractSurvey = function(frame,survey) {
rounds = seq(1,length(frame$results.format[[1]]))
roundResponses = lapply(rounds, function(round) {
getCol = paste("results.",survey,".",round, sep="")
surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
surveyFrame = frame[,surveyCols]
if (is.null(newCols)) {return("No newCols")}
names(surveyFrame) = newCols
surveyFrame$id = frame$id
surveyFrame$round = round
surveyFrame$batch = frame$batch
surveyFrame$rooms = frame$rooms
surveyFrame$manipulation = frame$results.manipulationCheck
surveyFrame$blacklist = frame$results.blacklistCheck
return(surveyFrame)
})
return(Reduce(rbind, roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below):
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) {
if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
return(any(batchData$batchComplete == TRUE))
}
return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
return(flatten(userFile, recursive = TRUE))
})
## Retroactively find rooms from chat data:
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
roomsForIndividual = lapply(seq(1,3),function(y) {
x$room = x$rooms[y]
x$round = y
return(x)
})
return(Reduce(rbind, roomsForIndividual))
})
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mtidyr [30m 0.8.1 [32m✔[30m [34mdplyr [30m 0.7.6
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mggplot2[30m::[32m%+%()[30m masks [34mpsych[30m::%+%()
[31m✖[30m [34mggplot2[30m::[32malpha()[30m masks [34mpsych[30m::alpha()
[31m✖[30m [34mbroom[30m::[32mbootstrap()[30m masks [34mmodelr[30m::bootstrap()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mplotly[30m::filter(), [34mstats[30m::filter()
[31m✖[30m [34mpurrr[30m::[32mflatten()[30m masks [34mjsonlite[30m::flatten()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mdplyr[30m::[32mrecode()[30m masks [34mlikert[30m::recode()[39m
library(mosaic)
Loading required package: lattice
Loading required package: ggformula
New to ggformula? Try the tutorials:
learnr::run_tutorial("introduction", package = "ggformula")
learnr::run_tutorial("refining", package = "ggformula")
Attaching package: ‘ggformula’
The following object is masked from ‘package:modelr’:
na.warn
Loading required package: mosaicData
Loading required package: Matrix
Attaching package: ‘Matrix’
The following object is masked from ‘package:tidyr’:
expand
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
Attaching package: ‘mosaic’
The following object is masked from ‘package:Matrix’:
mean
The following objects are masked from ‘package:dplyr’:
count, do, tally
The following object is masked from ‘package:purrr’:
cross
The following object is masked from ‘package:modelr’:
resample
The following object is masked from ‘package:plotly’:
do
The following object is masked from ‘package:ggplot2’:
stat
The following objects are masked from ‘package:psych’:
logit, read.file, rescale
The following objects are masked from ‘package:stats’:
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test, quantile, sd, t.test, var
The following objects are masked from ‘package:base’:
max, mean, min, prod, range, sample, sum
## To filter the right batch numbers, add the first batch number for our runs (i.e. batch >= "first batch #")
## overlappingFiles <- overlappingFiles %>% filter(batch=="1536132233074")
More cleaning before visualizations:
## Apply extract survey function to extract the right columns and rows for viability survey:
# overlappingFiles <- overlappingFiles %>% filter(batch>="1536276904547")
# qFifteen <- as.data.frame(extractSurvey(overlappingFiles, 'qFifteenCheck'))
# qSixteen <- as.data.frame(extractSurvey(overlappingFiles, 'qSixteenCheck'))
# viabilitySurvey <- as.data.frame(extractSurvey(overlappingFiles, 'viabilityCheck'))
# qFifteen <- qFifteen %>% select(id, results.qFifteenCheck)
# qSixteen <- qSixteen %>% select(id, results.qSixteenCheck)
# qFifteen$id <- unlist(qFifteen$id)
# qSixteen$id <- unlist(qSixteen$id)
# postSurveyQs <- cbind(qFifteen, qSixteen)
# frame <- cbind(viabilitySurvey, postSurveyQs)
# frame <- frame[, !duplicated(colnames(frame))]
frame <- extractSurvey(overlappingFiles, 'viabilityCheck')
## Reduce to vertically combine rows in roundwithRooms list:
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe, use manipulation check b/c required for complete observation:
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge:
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe:
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds:
data2 = data2 %>% select(id, batch, room, bonus, name, friends,
friends_history, results.condition, results.format,
results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (**this should be simplified**)
data2$batch <- unlist(data2$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id)
data$batch <- unlist(data$batch)
## Before merge, data and data2 should have the same # of observations
## Merge columns by id, round and batch #s:
data <- left_join(data, data2, by=NULL)
Joining, by = c("id", "round", "batch")
## Subset only observations with batch #s in complete batches
allConditions <- data[data$batch %in% completeBatches, ]
Conditionally assign conditions based on treatment and results column:
## Messy, but robust? Verify, verify, verify people:
data <- data %>% mutate(
condition = case_when(
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='baseline' & results.format=="1:3" & round==1 ~ "A" ,
results.condition=='baseline' & results.format=="1:3" & round==2 ~ "B" ,
results.condition=='baseline' & results.format=="1:3" & round==3 ~ "C"
))
Set-up for factors for viability questions:
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off):
data <- na.omit(data)
## Factor for visualizations:
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree")
clean <- data %>%
mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels)))
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
stats$sum[i] <- sum(stats[i,1:14])
}
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping:
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
group_by(room, batch, round, condition, results.condition, results.format) %>% ## to-do: add group ID in storage db.
summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n, groupID=runif(1,0,2)) %>%
## Filter out all teams with n=1 (i.e. a person in a one person team)
filter(n==2)
## Individual proportion:
individualProportion <- stats %>% group_by(round, batch, room) %>%
mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>%
filter(n>1)
## Table showing how many teams we've run in each condition combination:
print(table(groupedProportion$condition, groupedProportion$results.condition))
baseline control treatment
A 26 10 25
Ap 0 10 25
B 20 14 22
C 29 0 0
Probability of fracture across teams and condition combinations:
left_join(baselineFracture1, baselineFracture2, by=NULL, drop=TRUE)
Joining, by = c("room", "batch", "round", "condition", "results.condition", "results.format", "n", "mean", "median", "prop", "groupID", "fracture")
Section #1: Does fracture have continuity with prior measures?
1A: Report the % of correct guesses on the manipulation check:
## Treatment manipulation:
treatmentManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="treatment") %>%
summarise(n=n())
## If user manipulation answers == manipulation answer key then add 1
treatmentManipulation$correctAnswers <- sum(ifelse(treatmentManipulation$manipulationAnswers==treatmentManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for treatment:
treatmentManipulation <- treatmentManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(treatmentManipulation))))
## Control manipulation:
controlManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="control") %>%
summarise(n=n())
## If user manipulation answers == manipulation answer key then add 1
controlManipulation$correctAnswers <- sum(ifelse(controlManipulation$manipulationAnswers==controlManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for control:
controlManipulation <- controlManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(controlManipulation))))
## Proportion test comparing treatment manipulation percent correct with chance:
1B: Proportion test comparing treatment manipulation percent correct with chance:
## ⅓ ~ 33% = chance, ~43 = treatment (FILL-IN EACH TIME YOU RUN WITH NEW DATA!)
propManipulation <- prop.test(x = c(33, 43), n = c(100, 100))
print(propManipulation)
2-sample test for equality of proportions with continuity correction
data: c(33, 43) out of c(100, 100)
X-squared = 1.719, df = 1, p-value = 0.1898
alternative hypothesis: two.sided
95 percent confidence interval:
-0.24382407 0.04382407
sample estimates:
prop 1 prop 2
0.33 0.43
1C: Logistic regression predicting binary fracture outcome from viability scales:
## Split data into 60% training and 40% testing data sets to test how well the model performs
set.seed(123)
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)
sample <- sample(c(TRUE, FALSE), nrow(groupedProportionFracture), replace = T, prob = c(0.6,0.4))
train <- groupedProportionFracture[sample, ]
test <- groupedProportionFracture[!sample, ]
## Simple logistic regression: we will fit a logistic regression model in order to predict
## the probability of fracture based on a team's average viability sum:
model1 <- glm(fracture ~ mean, family = "binomial", data = train)
summary(model1)
Call:
glm(formula = fracture ~ mean, family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.94811 -0.46471 0.04798 0.38542 2.43161
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 13.79411 2.58357 5.339 9.34e-08 ***
mean -0.26503 0.04908 -5.400 6.68e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 152.165 on 109 degrees of freedom
Residual deviance: 71.765 on 108 degrees of freedom
AIC: 75.765
Number of Fisher Scoring iterations: 6
## To assess the linear regression deviance, look at deviance in summary output, if
## deviance == sum of sqaures in linear regression, null deviance == difference between
## a model with only the intercept ("no mean predictors") and the a saturated model
## a model with a theoretically perfect fit. Model deviance (residual deviance) should be lower
## small values == better fit.
tidy(model1)
## Coefficient estimtes from log regression characterize relationship between the predictor and
## response variable on a log-odds scale, so binary increase from no fracture - fracture can be interpreted as associated with a decrease in mean viability sum.
## More coefficient output: measure the confidence intervals and accuracy of the coefficent:
confint(model1)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) 9.3617399 19.6211689
mean -0.3757527 -0.1807974
## Making predictions:
## What is the probability of fracture given the following team mean viability scores: for example purposes: 40 and 65:
predict(model1, data.frame(mean = c(50, 65)), type = "response")
1 2
0.63238720 0.03127928
## From the output, we can see that the probability of fracture decreases by ~40% when mean viability sum increases from 40 to 65.
## Model evaluation & diagnostics:
## How well does the model fit the data? And how accurate are the predictions on an out-of-sample data set?
## Residul assessment:
model1_data <- augment(model1) %>%
mutate(index = 1:n())
ggplot(model1_data, aes(index, .std.resid, color = mean)) +
geom_point(alpha = .5) +
geom_ref_line(h = 3)

## Validation of predicted values:
## How well does the model perform when predicting the target variable on out-of-sample observations?
test.predicted.m1 <- predict(model1, newdata = test, type = "response")
## Classification performance for each model on the test data. Output gives us a list of true / false positives:
list(
model1 = table(test$mean, test.predicted.m1 > 0.5) %>% prop.table() %>% round(3))
$model1
FALSE TRUE
32 0.000 0.014
32.5 0.000 0.014
36 0.000 0.028
36.5 0.000 0.014
37 0.000 0.014
41.5 0.000 0.028
44 0.000 0.014
45 0.000 0.014
46.5 0.000 0.028
47 0.000 0.014
48.5 0.000 0.042
49 0.000 0.028
49.5 0.000 0.014
50 0.000 0.028
50.5 0.000 0.014
52 0.000 0.028
52.5 0.014 0.000
53 0.028 0.000
54 0.028 0.000
54.5 0.014 0.000
55.5 0.028 0.000
56 0.099 0.000
56.5 0.014 0.000
57 0.028 0.000
58 0.028 0.000
58.5 0.042 0.000
59 0.014 0.000
59.5 0.014 0.000
60 0.014 0.000
60.5 0.014 0.000
61 0.014 0.000
61.5 0.028 0.000
62 0.028 0.000
62.5 0.014 0.000
63 0.056 0.000
64 0.014 0.000
65 0.014 0.000
66 0.014 0.000
66.5 0.014 0.000
69 0.014 0.000
70 0.070 0.000
table(test$mean, test.predicted.m1 > 0.5)
FALSE TRUE
32 0 1
32.5 0 1
36 0 2
36.5 0 1
37 0 1
41.5 0 2
44 0 1
45 0 1
46.5 0 2
47 0 1
48.5 0 3
49 0 2
49.5 0 1
50 0 2
50.5 0 1
52 0 2
52.5 1 0
53 2 0
54 2 0
54.5 1 0
55.5 2 0
56 7 0
56.5 1 0
57 2 0
58 2 0
58.5 3 0
59 1 0
59.5 1 0
60 1 0
60.5 1 0
61 1 0
61.5 2 0
62 2 0
62.5 1 0
63 4 0
64 1 0
65 1 0
66 1 0
66.5 1 0
69 1 0
70 5 0
1D: Graph of fracture/no fracture vs. mean/stdev viability scale
## Overall:
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 across all conditions (n=2)",
x="",
y="Numeric sum of viability measures questions (range: 7-70)")

## By condition + format:
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 (n=2)",
x="",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFracture, aes(fracture, mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums across all conditions (n=2)") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFracture, aes(fracture, mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums (n=2)") + facet_grid(condition ~ results.condition)

Section #2: How often does fracture occur?
2A: P(binary fracture) histogram of fracture proportion (by team)
## First make sure you've made ABC condition
ggplot(data=groupedProportion, aes(groupedProportion$prop)) +
geom_histogram(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence",
x="team fracture proportion", y="Count") + facet_grid(results.condition ~ .)


library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
The following object is masked from ‘package:plotly’:
select
## Test if whether fracture smoking habit is independent of condition at .05 significance level:
chi = table(groupedProportionFracture$fracture, groupedProportionFracture$results.condition)
chi
baseline control treatment
0 41 18 38
1 34 16 34
print(chisq.test(chi))
Pearson's Chi-squared test
data: chi
X-squared = 0.059809, df = 2, p-value = 0.9705
2B: What’s the overall % of fracturing the second time? (by condition)
## Filtering second time only:
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)
## TO-DO: review below code, worried it's incorrect:
groupedProportionFractureSecond <- groupedProportionFracture %>% group_by(results.condition) %>%
filter(condition=="Ap") %>%
mutate(overallPercent = sum(as.numeric(fracture))) %>%
summarise(n=n(), overallPercent=unique(overallPercent)/n)
print(groupedProportionFractureSecond)
## For baseline theoretical distribution: (to-do: look over this, worried also that it's wrong)
groupedProportionFractureSecondBase <- groupedProportionFractureBaseline %>%
filter(round=="3") %>%
summarise(n=n(), overallPercent=sum(as.numeric(fracture))/n)
print(mean(groupedProportionFractureSecondBase$overallPercent))
[1] 0.4930556
## Set-up separate groups for proportion tests to answer: does unmasked fracture more/less than masked
## and more/less than new pairs?
## Control:
## Proportion tests for each (TO-DO, figure out argument stuff: this doesn't seem like the right test to run for the question we're asking?) also, fill-this in every time you get new data:
## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. treatment + baseline)
treatmentvsBaselineChange <- c(48, 49)
propFractureChangeTreatmentvsBase <- prop.test(x = c(treatmentvsBaselineChange), n = c(100, 100))
print(propFractureChangeTreatmentvsBase)
2-sample test for equality of proportions with continuity correction
data: c(treatmentvsBaselineChange) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
-0.1585211 0.1385211
sample estimates:
prop 1 prop 2
0.48 0.49
## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. control + baseline)
controlvsBaselineChange <- c(40, 49)
propFractureChangeControlvsBase <- prop.test(x = c(controlvsBaselineChange), n = c(100, 100))
print(propFractureChangeControlvsBase)
2-sample test for equality of proportions with continuity correction
data: c(controlvsBaselineChange) out of c(100, 100)
X-squared = 1.2957, df = 1, p-value = 0.255
alternative hypothesis: two.sided
95 percent confidence interval:
-0.23718348 0.05718348
sample estimates:
prop 1 prop 2
0.40 0.49
Section 3: How consistent is fracture?
3A: calculating the conditional probabilities of fracture for each condition:
controlFracture12 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured only 1
controlFractureOnly1 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture)
# of groups who fractured only 2
controlFractureOnly2 <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured never
controlFractureNever <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture)
conditionalProbControl <- matrix(c(controlFracture12, controlFractureOnly1, controlFractureOnly2, controlFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbControl) <- c("Ap frac","Ap no frac")
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbControl <- as.table(conditionalProbControl)
conditionalProbControl
Ap frac Ap no frac
A 0.3 0.0
B 0.1 0.6
3B: Now investigating the big result: switch %: what’s the % of pairs flipping their decisions?
controlFractureAbsSum <- sum(controlFracture$absFracture)
print(controlFractureAbsSum)
[1] 1
---
title: "Asplode test outline"
output: html_notebook
---

## Set-up packages // libraries and prepare for data import: 
```{r}
## Uncomment install.packages and run outside of notebook environment: 
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic", "modelr", "broom"))

library(psych)
library(likert)
library(jsonlite)
library(ggplot2)
library(plotly)
library(modelr)
library(broom)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
```

## Set-up directories, import and clean data: 
```{r}
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R") 
getwd()
dataPath = "../.data"
## Define function to extract survey results: 
extractSurvey = function(frame,survey) {
  rounds = seq(1,length(frame$results.format[[1]]))
  roundResponses = lapply(rounds, function(round) {
    getCol = paste("results.",survey,".",round, sep="")
    surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
    newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
    surveyFrame = frame[,surveyCols]
    if (is.null(newCols)) {return("No newCols")}
    names(surveyFrame) = newCols
    surveyFrame$id = frame$id
    surveyFrame$round = round
    surveyFrame$batch = frame$batch
    surveyFrame$rooms = frame$rooms
    surveyFrame$manipulation = frame$results.manipulationCheck
    surveyFrame$blacklist = frame$results.blacklistCheck
    return(surveyFrame)
  })
  return(Reduce(rbind, roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below): 
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) { 
  if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
    batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
    return(any(batchData$batchComplete == TRUE))
  } 
  return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
  userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
  return(flatten(userFile, recursive = TRUE))
})

## Retroactively find rooms from chat data: 
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
  roomsForIndividual = lapply(seq(1,3),function(y) {
    x$room = x$rooms[y]
    x$round = y
    return(x)
  })
  return(Reduce(rbind, roomsForIndividual))
})
library(tidyverse)
library(mosaic)

## To filter the right batch numbers, add the first batch number for our runs (i.e. batch >= "first batch #")  
## overlappingFiles <- overlappingFiles %>% filter(batch=="1536132233074")

```

## More cleaning before visualizations: 
```{r}
## For when we start inputting new batches only: 
# overlappingFiles <- overlappingFiles %>% filter(batch>="1536276904547") 
# qFifteen <- as.data.frame(extractSurvey(overlappingFiles, 'qFifteenCheck'))
# qSixteen <- as.data.frame(extractSurvey(overlappingFiles, 'qSixteenCheck'))
# viabilitySurvey <- as.data.frame(extractSurvey(overlappingFiles, 'viabilityCheck'))  
# qFifteen <- qFifteen %>% select(id, results.qFifteenCheck)
# qSixteen <- qSixteen %>% select(id, results.qSixteenCheck) 
# qFifteen$id <-  unlist(qFifteen$id)
# qSixteen$id <-  unlist(qSixteen$id)
# postSurveyQs <- cbind(qFifteen, qSixteen) 
# frame <- cbind(viabilitySurvey, postSurveyQs)  
# frame <- frame[, !duplicated(colnames(frame))]

frame <- extractSurvey(overlappingFiles, 'viabilityCheck')
## Reduce to vertically combine rows in roundwithRooms list:  
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe, use manipulation check b/c required for complete observation: 
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge: 
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe: 
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds: 
data2 = data2 %>% select(id, batch, room, bonus, name, friends, 
                         friends_history, results.condition, results.format,
                  results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (**this should be simplified**)
data2$batch <- unlist(data2$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id) 
data$batch <- unlist(data$batch)

## Before merge, data and data2 should have the same # of observations
## Merge columns by id, round and batch #s: 
data <- left_join(data, data2, by=NULL)
## Subset only observations with batch #s in complete batches 
allConditions <- data[data$batch %in% completeBatches, ]
```
## Conditionally assign conditions based on treatment and results column: 
```{r}
## Messy, but robust? Verify, verify, verify people:   
data <- data %>% mutate(
  condition = case_when(
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='baseline' & results.format=="1:3" & round==1 ~ "A" ,
    results.condition=='baseline' & results.format=="1:3" & round==2 ~ "B" ,
    results.condition=='baseline' & results.format=="1:3" & round==3 ~ "C" 
  )) 

```

## Set-up for factors for viability questions: 
```{r}
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off): 
data <- na.omit(data)
## Factor for visualizations: 
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree") 
clean <- data %>% 
  mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels))) 
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
  stats$sum[i] <- sum(stats[i,1:14])                          
} 
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping: 
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
  group_by(room, batch, round, condition, results.condition, results.format) %>%  ## to-do: add group ID in storage db. 
  summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n, groupID=runif(1,0,2)) %>% 
## Filter out all teams with n=1 (i.e. a person in a one person team) 
  filter(n==2)

## Individual proportion: 
individualProportion <- stats %>% group_by(round, batch, room) %>% 
  mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% 
  filter(n>1)

## Table showing how many teams we've run in each condition combination:  
print(table(groupedProportion$condition, groupedProportion$results.condition)) 
```

## Probability of fracture across teams and condition combinations: 
```{r}
## Define and initialize cut-off point for fracture: 
groupedProportionFracture <- groupedProportion %>%
  group_by(results.condition, condition) %>% 
  mutate(fracture = case_when (prop<.50 ~ "0", 
                                prop>=.50~ "1")) 

## Baseline: (we probably won't need this since we are artificially making baseline conditions)
# groupedProportionFractureBaseline <- 
#   groupedProportionFracture %>% 
# filter(results.condition=="baseline") %>% 
#   filter(id %in% condition=="A" & condition=="C")
# baselineFracture1 <- groupedProportionFractureBaseline %>% filter(condition=="A")
# baselineFracture2 <- groupedProportionFractureBaseline %>% filter(condition=="C")

# TO-DO: check if this is right--seems off (why is baseline off?)

groupedProportionFractureBaseline <- groupedProportionFracture %>% filter(results.format=="c(1, 1, 2)" | results.format=="c(2, 1, 1)") %>% filter(round=="1" | round=="3") 
baselineFracture1 <- groupedProportionFractureBaseline %>% filter(round=="1")
baselineFracture2 <- groupedProportionFractureBaseline %>% filter(round=="3")
baselineFracture1$fracture1 <- baselineFracture1$fracture
baselineFracture2$fracture2 <- baselineFracture2$fracture
## Below only combines if teams are in both round 1 & 3 together: 

## Treatment: 
groupedProportionFractureTreatment <- groupedProportionFracture %>% 
filter(results.condition=="treatment")
treatmentFracture1 <- groupedProportionFractureTreatment %>% filter(condition=="A")
treatmentFracture2 <- groupedProportionFractureTreatment %>% filter(condition=="Ap")
treatmentFracture <- cbind(treatmentFracture1, treatmentFracture2)

## Control:  
groupedProportionFractureControl <- groupedProportionFracture %>% 
  filter(results.condition=="control") 
controlFracture1 <- groupedProportionFractureControl %>% filter(condition=="A")
controlFracture2 <- groupedProportionFractureControl %>% filter(condition=="Ap")
controlFracture <- cbind(controlFracture1, controlFracture2)

## Manipulation check: 
## Use individual proportion data frame b/c we're interested in looking at individuals that were in teams >n=1
teamPatterns <- "Team 1 and Team 2|Team 1 and Team 3|Team 2 and Team 3"
individualProportion$manipulationAnswerKey <- str_extract(individualProportion$results.manipulationCheck,teamPatterns) 
individualProportion$manipulationAnswers <- individualProportion$results.manipulation
individualProportion$results.condition = unlist(individualProportion$results.condition)
## Omit NAs and results that can't be processed: 
cleanManipulation <- individualProportion[with(individualProportion, grepl(teamPatterns, manipulationAnswers) & grepl(teamPatterns, manipulationAnswerKey)),]
## Transform into compatiable form: 
cleanManipulation$manipulationAnswers = unlist(cleanManipulation$manipulationAnswers)
```

## Section #1: Does fracture have continuity with prior measures?

## 1A: Report the % of correct guesses on the manipulation check: 
```{r}
## Treatment manipulation: 
treatmentManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="treatment") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
treatmentManipulation$correctAnswers <- sum(ifelse(treatmentManipulation$manipulationAnswers==treatmentManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for treatment: 
treatmentManipulation <- treatmentManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(treatmentManipulation))))

## Control manipulation: 
controlManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="control") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
controlManipulation$correctAnswers <- sum(ifelse(controlManipulation$manipulationAnswers==controlManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for control: 
controlManipulation <- controlManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(controlManipulation))))
## Proportion test comparing treatment manipulation percent correct with chance: 
```

## 1B: Proportion test comparing treatment manipulation percent correct with chance: 
```{r}
## ⅓ ~ 33% = chance, ~43 = treatment (FILL-IN EACH TIME YOU RUN WITH NEW DATA!)

propManipulation <- prop.test(x = c(33, 43), n = c(100, 100))
print(propManipulation)
```

## 1C: Logistic regression predicting binary fracture outcome from viability scales: 
```{r}
## Split data into 60% training and 40% testing data sets to test how well the model performs 
set.seed(123)
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture) 
sample <- sample(c(TRUE, FALSE), nrow(groupedProportionFracture), replace = T, prob = c(0.6,0.4))
train <- groupedProportionFracture[sample, ]
test <- groupedProportionFracture[!sample, ]

## Simple logistic regression: we will fit a logistic regression model in order to predict 
## the probability of fracture based on a team's average viability sum: 

model1 <- glm(fracture ~ mean, family = "binomial", data = train)
summary(model1)

## To assess the linear regression deviance, look at deviance in summary output, if 
## deviance == sum of sqaures in linear regression, null deviance == difference between 
## a model with only the intercept ("no mean predictors") and the a saturated model 
## a model with a theoretically perfect fit. Model deviance (residual deviance) should be lower 
## small values == better fit. 

tidy(model1)
## Coefficient estimtes from log regression characterize relationship between the predictor and 
## response variable on a log-odds scale, so binary increase from no fracture - fracture can be interpreted as associated with a decrease in mean viability sum. 

## More coefficient output: measure the confidence intervals and accuracy of the coefficent: 
confint(model1)

## Making predictions: 
## What is the probability of fracture given the following team mean viability scores: for example purposes: 40 and 65: 

predict(model1, data.frame(mean = c(50, 65)), type = "response")

## From the output, we can see that the probability of fracture decreases by ~40% when mean viability sum increases from 40 to 65. 

## Model evaluation & diagnostics: 
## How well does the model fit the data? And how accurate are the predictions on an out-of-sample data set?

## Residul assessment: 

model1_data <- augment(model1) %>% 
  mutate(index = 1:n())

ggplot(model1_data, aes(index, .std.resid, color = mean)) + 
  geom_point(alpha = .5) +
  geom_ref_line(h = 3)

## Validation of predicted values: 
## How well does the model perform when predicting the target variable on out-of-sample observations? 

test.predicted.m1 <- predict(model1, newdata = test, type = "response")

## Classification performance for each model on the test data. Output gives us a list of true / false positives: 

list(
  model1 = table(test$mean, test.predicted.m1 > 0.5) %>% prop.table() %>% round(3)) 

table(test$mean, test.predicted.m1 > 0.5)
```
## 1D: Graph of fracture/no fracture vs. mean/stdev viability scale

```{r}
## Overall: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 across all conditions (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 

```

```{r}
## By condition + format: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition) 
```

```{r}
ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums across all conditions (n=2)") +  facet_grid(condition ~ results.condition) 
```

```{r}
ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums (n=2)") +  facet_grid(condition ~ results.condition) 
```

## Section #2: How often does fracture occur?

## 2A: P(binary fracture) histogram of fracture proportion (by team)
```{r}
## First make sure you've made ABC condition
ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence", 
                                  x="team fracture proportion", y="Count") + facet_grid(results.condition ~ .) 
```
```{r}
ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence", 
                                  x="team fracture proportion", y="Count") + facet_grid(results.condition ~ condition)  
```

```{r}
library(MASS)     

## Test if whether fracture smoking habit is independent of condition at .05 significance level: 

chi = table(groupedProportionFracture$fracture, groupedProportionFracture$results.condition)  
chi
print(chisq.test(chi)) 
```

## 2B: What's the overall % of fracturing the second time? (by condition)
```{r}
## Filtering second time only: 
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)

## TO-DO: review below code, worried it's incorrect: 

groupedProportionFractureSecond <- groupedProportionFracture %>% group_by(results.condition) %>% 
      filter(condition=="Ap") %>% 
      mutate(overallPercent = sum(as.numeric(fracture))) %>% 
      summarise(n=n(), overallPercent=unique(overallPercent)/n) 
print(groupedProportionFractureSecond)

## For baseline theoretical distribution: (to-do: look over this, worried also that it's wrong)

groupedProportionFractureSecondBase <- groupedProportionFractureBaseline %>% 
      filter(round=="3") %>% 
      summarise(n=n(), overallPercent=sum(as.numeric(fracture))/n) 
print(mean(groupedProportionFractureSecondBase$overallPercent))

## Set-up separate groups for proportion tests to answer: does unmasked fracture more/less than masked, and more/less than new pairs? 

## Control: 
## Proportion tests for each (TO-DO, figure out argument stuff: this doesn't seem like the right test to run for the question we're asking?) also, fill-this in every time you get new data: 

## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. treatment + baseline) 
treatmentvsBaselineChange <- c(48, 49)
propFractureChangeTreatmentvsBase <- prop.test(x = c(treatmentvsBaselineChange), n = c(100, 100))
print(propFractureChangeTreatmentvsBase) 

## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. control + baseline) 
controlvsBaselineChange <- c(40, 49)
propFractureChangeControlvsBase <- prop.test(x = c(controlvsBaselineChange), n = c(100, 100))
print(propFractureChangeControlvsBase)
```
## Section 3: How consistent is fracture? 

## 3A: calculating the conditional probabilities of fracture for each condition: 
```{r}

## If fractured the first time, what's the % of fracturing the second time? look at 1 and Ap combination for % in second time: 

## Compare both treatment and control with baseline as theoretical distribution: 

## Conditional probability for masked: 
# of groups who fractured 1 and 2
treatmentFracture12 <- sum(ifelse(treatmentFracture1$fracture=="1" & treatmentFracture2$fracture=="1",1,0))/nrow(treatmentFracture)
# of groups who fractured only 1
treatmentFractureOnly1 <- sum(ifelse(treatmentFracture1$fracture=="1" & treatmentFracture2$fracture=="0",1,0))/nrow(treatmentFracture) 
# of groups who fractured only 2
treatmentFractureOnly2 <- sum(ifelse(treatmentFracture1$fracture=="0" & treatmentFracture2$fracture=="1",1,0))/nrow(treatmentFracture)
# of groups who fractured never        
treatmentFractureNever <- sum(ifelse(treatmentFracture1$fracture=="0" & treatmentFracture2$fracture=="0",1,0))/nrow(treatmentFracture)
conditionalProbTreatment <- matrix(c(treatmentFracture12, treatmentFractureOnly1, treatmentFractureOnly2, treatmentFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbTreatment) <- c("Ap frac","Ap no frac") 
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbTreatment <- as.table(conditionalProbTreatment)
conditionalProbTreatment

## Conditional proability for unmasked: 
# of groups who fractured 1 and 2
controlFracture12 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured only 1
controlFractureOnly1 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture) 
# of groups who fractured only 2
controlFractureOnly2 <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured never        
controlFractureNever <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture)
conditionalProbControl <- matrix(c(controlFracture12, controlFractureOnly1, controlFractureOnly2, controlFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbControl) <- c("Ap frac","Ap no frac") 
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbControl <- as.table(conditionalProbControl)
conditionalProbControl
```

## 3B: Now investigating the big result: switch %: what's the % of pairs flipping their decisions?

```{r}
## Convert format before merge: *this can be simplified*: 

treatmentFracture2$fracture <- as.numeric(treatmentFracture2$fracture) 
treatmentFracture1$fracture <- as.numeric(treatmentFracture1$fracture) 
treatmentFracture$absFracture <- abs(treatmentFracture2$fracture-treatmentFracture1$fracture)

controlFracture1$fracture <- as.numeric(controlFracture1$fracture)
controlFracture2$fracture <- as.numeric(controlFracture2$fracture)
controlFracture$absFracture <- abs(controlFracture2$fracture-controlFracture1$fracture)

# baselineFracture$fracture1 <- as.numeric(baselineFracture$fracture1) 
# baselineFracture$fracture2 <- as.numeric(baselineFracture$fracture2) 
# baselineFracture$absFracture <- abs(baselineFracture$fracture1-baselineFracture$fracture2)

## Results here seem fishy? Triple check my code for abs/sum? 

treatmentFractureAbsSum <- sum(treatmentFracture$absFracture) 
print(treatmentFractureAbsSum)
controlFractureAbsSum <- sum(controlFracture$absFracture)
print(controlFractureAbsSum)
## baselineFractureAbsSum <- sum(baselineFracture$absFracture) 

## Don't we want a proportion test here before adding / trying chi-squared? 

treatmentAbsChange=c() 
controlAbsChange=c()

## Using baseline as theoretical probability distribution 
## Treatment: 
chisq.test(treatmentAbsChange,p=baselineAbsChange)

## Control:
chisq.test(controlAbsChange,p=baselineAbsChange)
```

## Visualizing change: 

```{r}
## Visualize differences: 
normalizedTreatmentChange <- treatmentFractureAbsSum/nrow(treatmentFracture)
normalizedControlChange <- controlFractureAbsSum/nrow(controlFracture)

dat <- data.frame(changesPerCondition= factor(c("Normalized percentage of absolute fracture change in masked >=.50","Normalized percentage of absolute fracture change in unmasked >=.50"), levels=c("Normalized percentage of absolute fracture change in masked >=.50","Normalized percentage of absolute fracture change in unmasked >=.50")),
                  fractureValueSwitch = c(normalizedTreatmentChange, normalizedControlChange))

p <- ggplot(data=dat, aes(x=changesPerCondition, y=fractureValueSwitch)) +
  geom_bar(stat="identity")
p <- ggplotly(p)
p
```








